logo


Data cleaning

Contracts data

ma<-read.csv("ma.csv", sep=";", quote = "")
head(ma)
##   X900 X000010000010 X1301047128       Person X00.00.0000 X00.00.0000.1 X.1 X.2
## 1  900      10000159  1301001838 Organization  00.00.0000    00.00.0000  NA    
## 2  900      10000206  1301048279       Person  00.00.0000    00.00.0000  NA    
## 3  900      10000230  1301010418 Organization  00.00.0000    00.00.0000  NA    
## 4  900      10000264  1301003500 Organization  00.00.0000    00.00.0000  NA    
## 5  900      10000280  1301004205 Organization  00.00.0000    00.00.0000  NA    
## 6  900      10000311  1301023370       Person  00.00.0000    00.00.0000  NA    
##   X.3 X.4 X00.00.0000.2 X00.00.0000.3 X.5 X.6 X.7 X.8 X.9 Collection.Strategy.1
## 1  NA  NA    00.00.0000    00.00.0000      NA      NA     Collection Strategy 1
## 2  NA  NA    00.00.0000    00.00.0000      NA      NA     Collection Strategy 1
## 3  NA  NA    00.00.0000    00.00.0000      NA      NA     Collection Strategy 1
## 4  NA  NA    00.00.0000    00.00.0000      NA      NA     Collection Strategy 1
## 5  NA  NA    00.00.0000    00.00.0000      NA      NA     Collection Strategy 1
## 6  NA  NA    00.00.0000    00.00.0000      NA      NA     Collection Strategy 1
##   X19.12.2017 X00 ... X0.00 X00.1 X00.00.0000.4 X.10 X.11 X.12 X.13 X.14 X.15
## 1  19.12.2017   0 . .  0,00     0    00.00.0000        NA   NA   NA   NA   NA
## 2  19.12.2017   0 . .  0,00     0    00.00.0000        NA   NA   NA   NA   NA
## 3  19.12.2017   0 . .  0,00     0    00.00.0000        NA   NA   NA   NA   NA
## 4  19.12.2017   0 . .  0,00     0    00.00.0000        NA   NA   NA   NA   NA
## 5  19.12.2017   0 . .  0,00     0    00.00.0000        NA   NA   NA   NA   NA
## 6  19.12.2017   0 . .  0,00     0    00.00.0000        NA   NA   NA   NA   NA
##   X.16 X.17 X.18 X.19 X6027 PP          PATER.EUWENSWEG   ZN
## 1         X            6010 PP             RIALTOSTRAAT    2
## 2                      6010 PP             RIALTOSTRAAT    6
## 3         X            6027 PP GOUVERNEUR VAN SLOBBEWEG   ZN
## 4                      6005 IB               MOLENPLEIN 19 A
## 5         X            6005 PP               MOLENPLEIN  19B
## 6                      6099 PP                    PARIS  15A
##          PATER.EUWENSWEG.1 X00.2 X
## 1             RIALTOSTRAAT     0  
## 2             RIALTOSTRAAT     0 X
## 3 GOUVERNEUR VAN SLOBBEWEG     0 X
## 4               MOLENPLEIN     0  
## 5               MOLENPLEIN     0  
## 6                    PARIS     0

Headers missing!!! Need to add the column names manually.

rm(ma) # Removing ma to save memory
 contracts <- read.csv("ma.csv", sep=";", dec=",", col.names = c(
 "sap_client",
 "contract_account",
 "business_partner",
 "bp_category",
 "bp_org_start",
 "bp_liquidation",
 "bp_title",
 "bp_gender",
 "bp_marital",
 "bp_job",
 "bp_dob",
 "bp_dod",
 "bp_nationality",
 "bp_legalform",
 "bp_legalentity",
 "bp_industry",
 "bp_accclass",
 "bp_collecstrategy",
 "record_date",
 "additional_days_cash_management",
 "process_lock_till",
 "direct_debit_limit",
 "no_of_months_debit_limit",
 "invoice_cycle_start",
 "incoming_payment_method",
 "lock_reason_incoming_payment",
 "dunning_procedure",
 "corresp_dunning_pro",
 "bp_lock_dunning",
 "penalty_waiver_code",
 "special_handling_code",
 "bp_phone",
 "bp_email",
 "bp_comm_method",
 "building_no",
 "room_no",
 "street",
 "house_no",
 "street_upper_case",
 "region",
 "active"
  ), quote = "", header = F)
 
 head(contracts)
##   sap_client contract_account business_partner  bp_category bp_org_start
## 1        900         10000010       1301047128       Person   00.00.0000
## 2        900         10000159       1301001838 Organization   00.00.0000
## 3        900         10000206       1301048279       Person   00.00.0000
## 4        900         10000230       1301010418 Organization   00.00.0000
## 5        900         10000264       1301003500 Organization   00.00.0000
## 6        900         10000280       1301004205 Organization   00.00.0000
##   bp_liquidation bp_title bp_gender bp_marital bp_job     bp_dob     bp_dod
## 1     00.00.0000       NA                   NA     NA 00.00.0000 00.00.0000
## 2     00.00.0000       NA                   NA     NA 00.00.0000 00.00.0000
## 3     00.00.0000       NA                   NA     NA 00.00.0000 00.00.0000
## 4     00.00.0000       NA                   NA     NA 00.00.0000 00.00.0000
## 5     00.00.0000       NA                   NA     NA 00.00.0000 00.00.0000
## 6     00.00.0000       NA                   NA     NA 00.00.0000 00.00.0000
##   bp_nationality bp_legalform bp_legalentity bp_industry bp_accclass
## 1                          NA                         NA            
## 2                          NA                         NA            
## 3                          NA                         NA            
## 4                          NA                         NA            
## 5                          NA                         NA            
## 6                          NA                         NA            
##       bp_collecstrategy record_date additional_days_cash_management
## 1 Collection Strategy 1  19.12.2017                               0
## 2 Collection Strategy 1  19.12.2017                               0
## 3 Collection Strategy 1  19.12.2017                               0
## 4 Collection Strategy 1  19.12.2017                               0
## 5 Collection Strategy 1  19.12.2017                               0
## 6 Collection Strategy 1  19.12.2017                               0
##   process_lock_till direct_debit_limit no_of_months_debit_limit
## 1               . .                  0                        0
## 2               . .                  0                        0
## 3               . .                  0                        0
## 4               . .                  0                        0
## 5               . .                  0                        0
## 6               . .                  0                        0
##   invoice_cycle_start incoming_payment_method lock_reason_incoming_payment
## 1          00.00.0000                                                   NA
## 2          00.00.0000                                                   NA
## 3          00.00.0000                                                   NA
## 4          00.00.0000                                                   NA
## 5          00.00.0000                                                   NA
## 6          00.00.0000                                                   NA
##   dunning_procedure corresp_dunning_pro bp_lock_dunning penalty_waiver_code
## 1                NA                  NA              NA                  NA
## 2                NA                  NA              NA                  NA
## 3                NA                  NA              NA                  NA
## 4                NA                  NA              NA                  NA
## 5                NA                  NA              NA                  NA
## 6                NA                  NA              NA                  NA
##   special_handling_code bp_phone bp_email bp_comm_method building_no room_no
## 1                                                               6027      PP
## 2                              X                                6010      PP
## 3                                                               6010      PP
## 4                              X                                6027      PP
## 5                                                               6005      IB
## 6                              X                                6005      PP
##                     street house_no        street_upper_case region active
## 1          PATER EUWENSWEG       ZN          PATER EUWENSWEG      0      X
## 2             RIALTOSTRAAT        2             RIALTOSTRAAT      0       
## 3             RIALTOSTRAAT        6             RIALTOSTRAAT      0      X
## 4 GOUVERNEUR VAN SLOBBEWEG       ZN GOUVERNEUR VAN SLOBBEWEG      0      X
## 5               MOLENPLEIN     19 A               MOLENPLEIN      0       
## 6               MOLENPLEIN      19B               MOLENPLEIN      0

Several formatting issues observed. Will note them down as I reformat the columns.

 contracts<-
 contracts%>%
   mutate(bp_category=as.factor(bp_category))%>%       #Business partner should be a factor
   mutate(bp_org_start=as.POSIXct(bp_category,tz="America/Curacao",format="%d.%m.%Y"))%>% #Start date should be a date (and not character)
   mutate(business_partner=as.character(business_partner))%>%  # Business partner ID should be a character
   mutate(contract_account=as.character(contract_account))%>% # Same with contract id
   mutate(bp_liquidation=as.POSIXct(bp_liquidation,tz="America/Curacao",format="%d.%m.%Y"))%>% # liquidation date should be date
   mutate(bp_title=as.factor(bp_title))%>% # Business partner title should be a factor
   mutate(bp_gender=as.factor(bp_gender))%>% # Gender should be a factor
   mutate(bp_marital=as.factor(bp_marital))%>% # Marital status should be factor
   mutate(bp_job=as.character(bp_job))%>%  # Job should be a character
   mutate(bp_dob=as.POSIXct(bp_dob,tz="America/Curacao",format="%d.%m.%Y"))%>% #Date of birth should be a date
   mutate(bp_dod=as.POSIXct(bp_dod,tz="America/Curacao",format="%d.%m.%Y"))%>% #Date of demise should be a date
   mutate(bp_nationality=as.character(bp_nationality))%>% # Nationality should be a character
   mutate(bp_legalform=as.factor(bp_legalform))%>% # Legal form should be a factor
   mutate(bp_legalentity=as.character(bp_legalentity))%>% # Legal entity should be a factor
   mutate(bp_industry=as.factor(bp_industry))%>% # Industry should be a factor
   mutate(bp_accclass=as.factor(bp_accclass))%>% # Class of account should be factor
   mutate(bp_collecstrategy=as.factor(bp_collecstrategy))%>% # Collection strategy should be a factor
   mutate(record_date=as.POSIXct(record_date,tz="America/Curacao",format="%d.%m.%Y"))%>% # Date should be date
   mutate(additional_days_cash_management=as.integer(additional_days_cash_management))%>% # Additional days shuould be an integer, since it is no. of days
   mutate(direct_debit_limit=as.numeric(direct_debit_limit))%>% # Debit limit should be numeric
   mutate(process_lock_till=as.POSIXct(process_lock_till,tz="America/Curacao",
                                       format="%d.%m.%Y"))%>% # Until indicates date or time. Here date.
   mutate(no_of_months_debit_limit=as.numeric(no_of_months_debit_limit))%>% # number of months should be numeric
   mutate(invoice_cycle_start=as.POSIXct(invoice_cycle_start,tz="America/Curacao",
                                         format="%d.%m.%Y"))%>% # Invoice cycle start date should be date
   mutate(incoming_payment_method=as.factor(incoming_payment_method))%>% # Incoming payment method should be factor
   mutate(lock_reason_incoming_payment=as.factor(lock_reason_incoming_payment))%>% # Lock reason should be factor
   mutate(dunning_procedure=as.factor(dunning_procedure))%>% # Dunning procedure should be factor
   mutate(corresp_dunning_pro=as.factor(corresp_dunning_pro))%>% # Should be factor
   mutate(bp_lock_dunning=as.character(bp_lock_dunning))%>% # Should be character
   mutate(penalty_waiver_code=as.character(penalty_waiver_code))%>% # should be factor/character
   mutate(special_handling_code=as.character(special_handling_code))%>% # Should be character/factor
   mutate(bp_phone=as.character(bp_phone))%>% # Needs to be in character else will show in decimals etc.
   mutate(bp_email=as.character(bp_email))%>% # Should be character
   mutate(bp_comm_method=as.factor(bp_comm_method))%>% # Should be factor
   mutate(building_no=as.character(building_no))%>% # Should be character
   mutate(room_no=as.factor(room_no))%>%  # using factor because this column is used to identify area, confirmed by Rigo
   mutate(street=as.character(street))%>% # Should be as character
   mutate(house_no=as.character(house_no))%>% # Should be character
   mutate(street_upper_case=as.character(street_upper_case))%>% # Should be character
   mutate(region=as.factor(region))%>% # Should be factor
   mutate(active=ifelse(active=="X",T,F)) # Changed to logical using T and F instead of X

head(contracts)
##   sap_client contract_account business_partner  bp_category bp_org_start
## 1        900         10000010       1301047128       Person         <NA>
## 2        900         10000159       1301001838 Organization         <NA>
## 3        900         10000206       1301048279       Person         <NA>
## 4        900         10000230       1301010418 Organization         <NA>
## 5        900         10000264       1301003500 Organization         <NA>
## 6        900         10000280       1301004205 Organization         <NA>
##   bp_liquidation bp_title bp_gender bp_marital bp_job bp_dob bp_dod
## 1           <NA>     <NA>                 <NA>   <NA>   <NA>   <NA>
## 2           <NA>     <NA>                 <NA>   <NA>   <NA>   <NA>
## 3           <NA>     <NA>                 <NA>   <NA>   <NA>   <NA>
## 4           <NA>     <NA>                 <NA>   <NA>   <NA>   <NA>
## 5           <NA>     <NA>                 <NA>   <NA>   <NA>   <NA>
## 6           <NA>     <NA>                 <NA>   <NA>   <NA>   <NA>
##   bp_nationality bp_legalform bp_legalentity bp_industry bp_accclass
## 1                        <NA>                       <NA>            
## 2                        <NA>                       <NA>            
## 3                        <NA>                       <NA>            
## 4                        <NA>                       <NA>            
## 5                        <NA>                       <NA>            
## 6                        <NA>                       <NA>            
##       bp_collecstrategy record_date additional_days_cash_management
## 1 Collection Strategy 1  2017-12-19                               0
## 2 Collection Strategy 1  2017-12-19                               0
## 3 Collection Strategy 1  2017-12-19                               0
## 4 Collection Strategy 1  2017-12-19                               0
## 5 Collection Strategy 1  2017-12-19                               0
## 6 Collection Strategy 1  2017-12-19                               0
##   process_lock_till direct_debit_limit no_of_months_debit_limit
## 1              <NA>                  0                        0
## 2              <NA>                  0                        0
## 3              <NA>                  0                        0
## 4              <NA>                  0                        0
## 5              <NA>                  0                        0
## 6              <NA>                  0                        0
##   invoice_cycle_start incoming_payment_method lock_reason_incoming_payment
## 1                <NA>                                                 <NA>
## 2                <NA>                                                 <NA>
## 3                <NA>                                                 <NA>
## 4                <NA>                                                 <NA>
## 5                <NA>                                                 <NA>
## 6                <NA>                                                 <NA>
##   dunning_procedure corresp_dunning_pro bp_lock_dunning penalty_waiver_code
## 1              <NA>                <NA>            <NA>                <NA>
## 2              <NA>                <NA>            <NA>                <NA>
## 3              <NA>                <NA>            <NA>                <NA>
## 4              <NA>                <NA>            <NA>                <NA>
## 5              <NA>                <NA>            <NA>                <NA>
## 6              <NA>                <NA>            <NA>                <NA>
##   special_handling_code bp_phone bp_email bp_comm_method building_no room_no
## 1                                                               6027      PP
## 2                              X                                6010      PP
## 3                                                               6010      PP
## 4                              X                                6027      PP
## 5                                                               6005      IB
## 6                              X                                6005      PP
##                     street house_no        street_upper_case region active
## 1          PATER EUWENSWEG       ZN          PATER EUWENSWEG      0   TRUE
## 2             RIALTOSTRAAT        2             RIALTOSTRAAT      0  FALSE
## 3             RIALTOSTRAAT        6             RIALTOSTRAAT      0   TRUE
## 4 GOUVERNEUR VAN SLOBBEWEG       ZN GOUVERNEUR VAN SLOBBEWEG      0   TRUE
## 5               MOLENPLEIN     19 A               MOLENPLEIN      0  FALSE
## 6               MOLENPLEIN      19B               MOLENPLEIN      0  FALSE

Open items

op<-read.csv("op.csv", sep=";")
head(op)
##   X900 X000012648521 X6000 X0031 X01 X10 X X21.01.2016 X02.01.2018 ANG
## 1  900      10992283  6000  0031   1  10    21.01.2016  02.01.2018 ANG
## 2  900      12762422  6000  0031   1  10    21.01.2016  02.01.2018 ANG
## 3  900      10984492  6000  0031   1  10    21.01.2016  02.01.2018 ANG
## 4  900      10976986  6000  0031   1   1    21.01.2016  02.01.2018 ANG
## 5  900      11008621  6000  0031   2  10    21.01.2016  02.01.2018 ANG
## 6  900      12915009  6000  0031   1  10    21.01.2016  02.01.2018 ANG
##   X19.02.2016 X.1 X20.00 X.2 X00.00.0000 X00.00.0000.1 X.3 MI
## 1  19.02.2016  NA  20,00      00.00.0000    00.00.0000  NA MI
## 2  19.02.2016  NA  20,00      00.00.0000    00.00.0000  NA MI
## 3  19.02.2016  NA  20,00      00.00.0000    00.00.0000  NA MI
## 4  19.02.2016  NA  20,00      00.00.0000    00.00.0000  NA MI
## 5  19.02.2016  NA  20,00      00.00.0000    00.00.0000  NA MI
## 6  19.02.2016  NA  20,00      00.00.0000    00.00.0000  NA MI

Similar to last section the column names need to be added manually. Moreover some of the values are coded (For e.g. )

rm(op) #removing op

oi<-read.csv("op.csv", sep=";", quote = "", header = F,
             col.names = c(
               "sap_client",
               "contract_id",
               "main_tran",
               "sub_tran",
               "contract_type",
               "division",
               "deposit",
               "doc_date",
               "post_date",
               "currency",
               "due_date",
               "dunn_exclu",
               "amount",
               "payment_meth",
               "clearing_date",
               "clearing_post_date",
               "clearing_reason",
               "document_type"
             ))
# Since some of the columns are coded (for e.g. 001 for migration), we need to add the corresponding text. To do that, some documents have been created with the codes and their corresponding name/text. These data will be matched with the open items data.
doc_type<-read.csv("documenttypes.csv")
tran_type<-read.csv("trantype.csv")
tran_type<-
tran_type%>%
  mutate(trantype=paste0(HVORG,"_",TVORG))%>% # Joining the two columns for easier matching with open items data.
  select(c(4,3))%>%
  rename(trandesc=TXT30)  # Renaming a column to human readable
# unique(oi$dunn_exclu)


oi<-
  oi%>%
  mutate(contract_id=as.character(contract_id))%>%  # Contract ID needs to be character. We also need to match the format with the contracts table, in case we join the tables later.
  mutate(main_tran=as.factor(main_tran))%>%  # Main transaction code should be factor, not number
  mutate(sub_tran=as.factor(sub_tran))%>% # Sub transaction n=code should be factor, not number
  mutate(contract_type=as.factor(contract_type))%>% # Contract type should be factor
  mutate(division=as.factor(division))%>%  # Division should be factor
  mutate(deposit=ifelse(deposit=="X",T,F))%>% # Converting deposit to logical T/F instead of X and not
  mutate(doc_date=dmy(doc_date))%>% # Date should be date
  mutate(post_date=dmy(post_date))%>% # Date should be date
  mutate(due_date=dmy(due_date))%>% #Date should be date
  mutate(payment_meth=as.factor(payment_meth))%>% # payment method should be factor
  mutate(clearing_date=dmy(clearing_date))%>% # Date should be in date format
  mutate(clearing_post_date=dmy(clearing_post_date))%>% # Date should be in date format
  mutate(clearing_reason=as.factor(clearing_reason))%>% # Clearing reason should be factor
  mutate(document_type=as.factor(document_type)) # document type should be factor
 
# The amount column has some issues. Firstly the decimal demarker is comma. It did not convert into number despite mentioning while importing that decimal demarer is comma. That is because there is a "-" after the digits if the number is negative and R took it as text. So, now, I will resolve this issue.
oi<-  
oi%>%
  mutate(negative=ifelse(grepl(pattern="-$",x=amount),T,F))%>% # Create a column to mark negative transactions by identifying pattern in amount (ending with "-")
  mutate(amount=gsub(pattern="[^0-9\\,]",replacement = "", x=amount))%>% # Removing all the commas that are not used as decimal demarker
  mutate(amount=gsub(pattern="[,]",replacement = ".", x=amount))%>% # Replacing the decimal demarker comma with point
  mutate(amount=as.numeric(amount))%>% # Converting to numeric
  mutate(amount=ifelse(negative,amount*-1,amount)) # If negative marked then changing sign of amount
# Now the coded columns will be filled with actual value 
oi<-
  oi%>%
  mutate(contract_type=case_when(
    contract_type == "1" ~ "Residential",
    contract_type == "2" ~ "Commercial",
    contract_type == "3" ~ "Industrial",
    contract_type == "4" ~ "Street Light",
    contract_type == "5" ~ "Government",
    contract_type == "6" ~ "Own",
    contract_type == "7" ~ "Waste"
  ))%>% # Contract type changed manually
  left_join(doc_type, by=c("document_type"="Document.Type"))%>% # For document type, joined the corresponding table
  rename(documen_desc=Description) # Renamed the column for convenience

# Did similar thing for transaction type
oi<-  
oi%>%
    mutate(trantype=paste0(main_tran,"_",sub_tran))%>% 
    left_join(tran_type, by="trantype")
 

# Now from the contracts data, we prepare a list of active and inactive contracts
active_bp_data<-
  contracts%>%
  filter(active)%>%
  select(3:4)%>%unique()
inactive_bp_data<-
  contracts%>%
  filter(!active)%>%
  select(3:4)%>%unique()

# Now I will identify the business partners who have only active account, only inactive accounts, both active and inactive accounts. I will do that by joining the active and inactive contracts table using the business partner id as the common key.

bp_status<-
active_bp_data%>%
  full_join(inactive_bp_data, by="business_partner", suffix=c(".active",".inactive"))%>%
    mutate(status= case_when(
    is.na(bp_category.active) & !is.na(bp_category.inactive) ~ "Only Inactive",
    !is.na(bp_category.active) & is.na(bp_category.inactive) ~ "Only Active",
    !is.na(bp_category.active) & !is.na(bp_category.inactive) ~ "Both Active and Inactive",
    TRUE ~ "Not Sure"
  )
  )%>%
  mutate(bp_category=ifelse(is.na(bp_category.active), as.character(bp_category.inactive), as.character(bp_category.active)))%>%
   select(1,5,4)



# Now I will join the above data with the contracts table. So, the contracts table will now have information about whether the contract belongs to a business partner with just active account, just inactive account or both.
 contracts<-
 contracts%>%
   left_join(bp_status, by="business_partner")%>%
   select(-bp_category.y)%>%
   rename(bp_category=bp_category.x)
 
 # Similarly, we create another data set with business partner details and indication on the whether the business partner has only active, only inactive or both types of contracts.
bp_full<-
 contracts%>%
   select(business_partner,bp_category, bp_org_start,
          bp_liquidation, bp_title, bp_gender, bp_marital, bp_job, bp_dob, bp_dod, bp_nationality,
          bp_legalform, bp_legalentity, bp_industry, bp_accclass, bp_collecstrategy,
          bp_lock_dunning, penalty_waiver_code, special_handling_code, bp_phone, bp_email,
          bp_comm_method, building_no, room_no, street, house_no, region)%>%
   left_join(bp_status, by="business_partner")

# Checking whether the unique business partners match in bp_status and contracts
setdiff(unique(bp_status$business_partner), unique(contracts$business_partner))
## character(0)
# Checking whether the contract accounts in the contracts and open item data set are same
setdiff(unique(oi$contract_id), unique(contracts$contract_account))
##  [1] "15069003"     "14534176"     "15024601"     "15062036"     "15107713"    
##  [6] "15013957"     "15123743"     "15165636"     "15220225"     "15104003"    
## [11] "15179894"     "15139541"     "15155265"     "15178914"     "15206586"    
## [16] "15065057"     "15089476"     "15171892"     "15183324"     "15185752"    
## [21] "15227402"     "221010010000" "15135551"     "15024457"     "15164914"    
## [26] "15173211"     "15175145"     "15175392"     "15178472"     "15176722"    
## [31] "15185026"     "15089837"     "15067566"     "15164984"     "15250815"    
## [36] "15212794"     "15160736"     "15247396"     "15186695"     "15230843"    
## [41] "15269295"     "15252447"     "15258843"     "15174946"     "15248542"    
## [46] "15264222"     "15264233"     "15252403"     "15237272"     "15229557"    
## [51] "15276623"     "15266521"     "15268164"     "15253777"     "15242787"
# Checking if the active and inactive bp data set hs all the business partners that contract dataset has.
setdiff(unique(c(active_bp_data$business_partner,inactive_bp_data$business_partner)),unique(contracts$business_partner))
## character(0)

Looks like the contracts accounts in open items data and contracts data do not match exactly. i.e. there are some contracts in open items data that does not exist in contract account.

So, we will ignore those contract ids while joining the information related to business partner status in the open items data.

# We also add business partner status information in open items data
oi<-
 contracts%>%
   select(2,3,4, 41, 42)%>%
   inner_join(oi, by=c("contract_account"="contract_id"))

Now open items and master data are cleaned.

bp_full%>%
  mutate(bp_phone=ifelse(bp_phone=="X",T,F), bp_email=ifelse(bp_email=="X",T,F),
         address=tolower(paste(building_no,room_no, street, house_no, region, sep = "_")))%>%
  select(business_partner,bp_category.x, bp_phone, bp_email, address)%>%
  rename(bp_category=bp_category.x)%>%
  group_by(business_partner, bp_category)%>%
  summarise(bp_phone=sum(bp_phone), bp_email=sum(bp_email), address=paste(address, sep="/"))%>%
  unique()%>%
  ungroup()%>%
  mutate(multiple_address=ifelse(grepl(pattern="/",x=address), T, F))
## # A tibble: 121,046 x 6
##    business_partner bp_category  bp_phone bp_email address      multiple_address
##    <chr>            <fct>           <int>    <int> <chr>        <lgl>           
##  1 1301000000       Person              1        1 4335_pp_kay… FALSE           
##  2 1301000001       Person              0        0 7606_pp_lel… FALSE           
##  3 1301000002       Person              0        0 6815_pp_mah… FALSE           
##  4 1301000003       Organization        1        0 6227_pp_pis… FALSE           
##  5 1301000004       Organization        0        0 7032_pp_kay… FALSE           
##  6 1301000005       Organization        1        0 5223_pp_gar… FALSE           
##  7 1301000006       Organization        2        0 6036_pp_rem… FALSE           
##  8 1301000007       Organization       32        0 6207_wa_roo… FALSE           
##  9 1301000007       Organization       32        0 6023_fa_pat… FALSE           
## 10 1301000007       Organization       32        0 __arowakenw… FALSE           
## # … with 121,036 more rows

Analysis

Step 0

Adding the outstanding age

oi<-
  oi %>% 
  mutate(OUT_AGE=as.numeric((today()-due_date)))%>%
  mutate(AGE_BUCKET=factor(case_when(
    OUT_AGE<= 0 ~ "in future",
    OUT_AGE<= 30 ~ "0-30 days",
    OUT_AGE<= 60 ~ "31-60 days",
    OUT_AGE<= 90 ~ "61-90 days",
    OUT_AGE<= 180 ~ "91-180 days",
    OUT_AGE<= 365 ~ "181-365 days",
    OUT_AGE<= 730 ~ "1-2 years",
    OUT_AGE<= 1095 ~ "2-3 years",
    OUT_AGE<= 1460 ~ "3-4 years",
    OUT_AGE<= 1825 ~ "4-5 years",
    OUT_AGE> 1825 ~ "5+ years",
  ), levels = c("in future","0-30 days","31-60 days", "61-90 days", "91-180 days", "181-365 days", "1-2 years", "2-3 years", "3-4 years", "4-5 years",
                "5+ years")))

Identifying transactions that are deposits, Installments, waste or other (real)

oi<-
  oi%>%
   mutate(o.type= case_when(
    deposit == T ~ "Deposit",
    documen_desc == "Installments" ~ "Installments",
    contract_type=="Waste" | grepl("Waste",trandesc) ~ "Waste",
    TRUE ~ "Real"
  ))

Usually, the installments should be due in future. So, Will filter out transactions that are in future. After that, the transactions marked as installments must be actual outstanding. Then will also filter out waste.

oi<-
  oi %>% 
  filter(OUT_AGE!="in future") %>% 
  filter(o.type !="Waste")

Deposits need a separate treatment and should not be included in the transaction.

oi<-oi %>% 
  filter(o.type !="Deposit")
deposits<-oi %>% 
  filter(o.type =="Deposit")

List of contracts with inactive business partners

The list can be downloaded from below.

allinactive<-oi %>% 
  filter(status=="Only Inactive") 
allinactive %>% 
download_this(
    button_label = "Download Inactive",
    button_type = "primary",
    output_extension = ".xlsx",
    has_icon = TRUE,
    icon = "fa fa-save",
    self_contained=T)

Identify >5 years outstanding

fiveplus<-oi %>% 
  filter(AGE_BUCKET=="5+ years") # Filter to select only 5 + year old transactions
fiveplus %>% 
  group_by(status) %>% 
  summarise(count=n())
## # A tibble: 3 x 2
##   status                    count
##   <chr>                     <int>
## 1 Both Active and Inactive  29364
## 2 Only Active              173476
## 3 Only Inactive            307580

Missing data

oi%>%
  mutate(act.status=ifelse(is.na(active),"Not Known",active))%>%
  group_by(act.status)%>%
    summarise(amount=sum(amount))%>%
  ggplot(aes(x=amount, y=reorder(act.status,-amount), label=paste0(round(amount/1000000), " M")))+
  geom_col(fill="dodgerblue4")+
  geom_text(aes(x=ifelse(amount<0, amount+5000000, amount-5000000)), colour="white", angle =90)+
  labs(title = "Open items by contract active status",
       x="",
       y="",
       subtitle = "Not known because data missing from master data",
       caption = "Source: Download from SAP")+
  theme_classic()+
  #theme(axis.text.x = element_text(angle = 90))+
  coord_cartesian(clip = "off")+
  annotation_custom(l, ymin = 2.75, ymax = 3.75, xmin=1.5e+08,xmax=2e+08)